#encoding=utf-8
import json

# Input file: read line by line below, each line parsed as one JSON object.
in_path = '../../data/mixtral8x22_generations/finetuning_data/finetuning_data_117.jsonl'
# Output file: receives the numbered "Question:/Answer:" text entries.
out_path = '../../data/mixtral8x22_generations/finetuning_data/finetuning_data_117_chinese.txt'
# Shape of each input record: {"prompt": "xxxx", "completion": "yyyyy"}


def clear_str(str):
    """Strip known marker text from a prompt or completion string.

    A leading 'Task:', then a leading 'Output:', then a leading ' Output:'
    are each removed at most once, in that order. A trailing 'Output:' is
    removed as well. Trailing whitespace is then dropped and the result is
    terminated with exactly one newline.

    Args:
        str: The raw text to clean.

    Returns:
        The cleaned text, always ending in a single '\\n'.
    """
    text = str
    # Order matters: every marker is tested once against the text as already
    # trimmed by the markers before it.
    for marker in ('Task:', 'Output:', ' Output:'):
        if text.startswith(marker):
            text = text[len(marker):]
    trailer = 'Output:'
    if text.endswith(trailer):
        text = text[:-len(trailer)]
    # Normalise the ending to a single line break.
    return text.rstrip() + '\n'


# The input is UTF-8 encoded JSON (one object per line); writing the decoded
# text straight to a UTF-8 output file keeps Chinese characters readable.
end_token = '<|endoftext|>'
with open(in_path, "r", encoding='utf-8') as fin, \
        open(out_path, "w", encoding='utf-8') as fout:
    count = 0
    # Stream the file line by line instead of loading it all into memory.
    for line in fin:
        # Tolerate blank lines (e.g. a trailing empty line), which would
        # otherwise make json.loads raise; they are not numbered.
        if not line.strip():
            continue
        count += 1
        data = json.loads(line)
        prompt = clear_str(data['prompt'])
        completion = data['completion']
        # Remove the end-of-text marker only when it is really present;
        # blindly slicing off 13 characters would truncate completions
        # that do not end with it.
        if completion.endswith(end_token):
            completion = completion[:-len(end_token)]
        output = clear_str(completion)
        # Each entry: "<n>.Question:<prompt>Answer:<output>" plus a blank line
        # (prompt and output already end with a newline from clear_str).
        fout.write("{}.Question:{}Answer:{}\n".format(count, prompt, output))
